from bertopic import BERTopic
# NOTE: cudf (GPU DataFrame library) is aliased as `pd`, so `pd.*` below is
# cudf, not pandas — `.to_pandas()` calls later move data to the CPU.
import cudf as pd
from sentence_transformers import SentenceTransformer
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.preprocessing.text.stem import PorterStemmer
from cuml import IncrementalPCA
from cuml.random_projection import SparseRandomProjection
import seaborn as sns
import statsmodels.api as sm
import numpy as np
# global seaborn styling for all figures in this session
sns.set_style("darkgrid")
sns.set(rc={'figure.figsize':(10,10)})
from pprint import pprint
/home/jcosme/miniconda3/envs/naba/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
# input file
input_filename = './output_files/naba_data_removed_duplicates.csv'
# some column names
# full essay-prompt column header from the survey export (used verbatim as the key)
txt_col = "Copy/ Paste An Essay Response of 500 words or less (copy and paste in webform) using the following prompt: Community disruptions such as Covid-19 and other natural disasters can have deep lasting impacts. Discuss a challenge or barrier you have overcome during the Covid-19 pandemic."
label_col = 'Recipient'
# output files
norm_topic_count_filename = './output_files/norm_topic_count_df.csv'
decrease_chances_sencs_filename = './output_files/decrease_chances_sencs.csv'
increase_chances_sencs_filename = './output_files/increase_chances_sencs.csv'
bottom_k_sencs_filename = './output_files/bottom_k_sencs.csv'
top_k_sencs_filename = './output_files/top_k_sencs.csv'
# NOTE(review): random_seed is defined but not visibly passed to any model in
# this chunk — confirm it is used downstream or wire it into the models.
random_seed = 42
# sentence embedder (runs on GPU)
sentence_model = SentenceTransformer(
"all-mpnet-base-v2",
device="cuda",
)
# dimensionality reducer passed to BERTopic in place of UMAP; IncrementalPCA
# with n_components=1 collapses embeddings to a single axis — presumably
# intentional for this analysis, but unusually aggressive; verify.
umap = IncrementalPCA(
n_components=1,
whiten=True,
)
topic_model = BERTopic(
embedding_model=sentence_model,
top_n_words=6,
# diversity=1 -> maximal MMR word diversity (parameter exists in the
# BERTopic version used here; removed in later releases — pin the version)
diversity=1,
n_gram_range=(1, 1),
min_topic_size=10,
nr_topics='auto',
umap_model=umap,
)
# significance threshold and top-k topic count, presumably used further down
p_alpha = 0.1
k_topics = 5
# load the de-duplicated survey data onto the GPU
df = pd.read_csv(input_filename)
df.head()
| app_id | Gender:* Required fields are indicated with red symbol Permanent Contact Information | City (Permanent):* Required fields are indicated with red symbol Permanent Contact Information | State (Permanent):* Required fields are indicated with red symbol Permanent Contact Information | Zip (Permanent):* Required fields are indicated with red symbol Permanent Contact Information | Are you Black? (includes African, African American, Caribbean, etc.) | Preferred Mailing Address | College/University:Academic Profile | Classification (as of January 2022):Academic Profile | Major:Academic Profile | ... | Please provide details (i.e. company name, location, etc.) | I have accepted an internship for the summer of 2022 (June - August) | I have accepted an internship for the fall of 2022 (September- December) | Please provide details (i.e. company name, location, etc.).1 | I have accepted a permanent job offer | Please provide details (i.e. company name, location, etc.).2 | Have you received a CPA Exam Review? | Which CPA Exam Review have you received? | Copy/ Paste An Essay Response of 500 words or less (copy and paste in webform) using the following prompt: Community disruptions such as Covid-19 and other natural disasters can have deep lasting impacts. Discuss a challenge or barrier you have overcome during the Covid-19 pandemic. | Recipient | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Male | Brooklyn | NY | 11216 | Yes | Permanent | Medgar Evers College | Senior | Accounting | ... | <NA> | <NA> | <NA> | <NA> | No | <NA> | No | <NA> | During the early parts of 2020 one of the dead... | <NA> |
| 1 | 1 | Female | Bowie | MD | 20720 | Yes | Permanent | University of Maryland | Freshman | Business Management | ... | Received internship as a Summer 2022 Discovery... | Yes | No | <NA> | No | <NA> | No | <NA> | One of the most challenging times has been dur... | <NA> |
| 2 | 2 | Female | Chicago Heights | IL | 60411 | Yes | Permanent | North Carolina A&T State University | Freshman | Accounting | ... | <NA> | No | No | <NA> | No | <NA> | No | <NA> | When covid restrictions took place I was just ... | <NA> |
| 3 | 3 | Female | Chicago | IL | 60620 | Yes | Permanent | Loyola University Chicago | Sophomore | Accounting | ... | Ernst & Young, Chicago, Summer 2022 | Yes | No | Ernst & Young, Chicago, Summer 2022 | No | <NA> | No | <NA> | When the Covid-19 pandemic first began, I was ... | <NA> |
| 4 | 4 | Male | Baton Rouge | LA | 70806 | Yes | Campus/Temporary | Penn State University | Sophomore | Accounting | ... | <NA> | <NA> | <NA> | <NA> | No | <NA> | No | <NA> | Community disruptions such as Covid-19 and oth... | Yes |
5 rows × 37 columns
# correct the labels column: work on a copy so df is untouched until write-back
labels_df = df[label_col].copy()
labels_df
0 <NA>
1 <NA>
2 <NA>
3 <NA>
4 Yes
...
195 Yes
196 <NA>
197 <NA>
198 <NA>
199 Yes
Name: Recipient, Length: 200, dtype: object
# any non-null value (e.g. 'Yes') means the applicant was a recipient -> 1.0
labels_df[labels_df.notnull()] = 1.0
labels_df
0 <NA>
1 <NA>
2 <NA>
3 <NA>
4 1.0
...
195 1.0
196 <NA>
197 <NA>
198 <NA>
199 1.0
Name: Recipient, Length: 200, dtype: object
# missing value means not a recipient -> 0.0, then cast to a numeric dtype
labels_df[labels_df.isna()] = 0.0
labels_df = labels_df.astype('float')
labels_df
0 0.0
1 0.0
2 0.0
3 0.0
4 1.0
...
195 1.0
196 0.0
197 0.0
198 0.0
199 1.0
Name: Recipient, Length: 200, dtype: float64
# sanity check: class balance of recipients (1.0) vs non-recipients (0.0)
labels_df.value_counts()
1.0 110 0.0 90 Name: Recipient, dtype: int32
# write the cleaned numeric labels back into the dataframe
df[label_col] = labels_df
# normalize spaces: Remove extra whitespace between tokens and trim whitespace from the beginning and the end of each string.
df[txt_col] = df[txt_col].str.normalize_spaces()
# create a column for how many words in each paragraph
df['txt_len'] = df[txt_col].str.split().list.len()
# replace any 'N/A' values with empty strings
total_na = df[txt_col].copy().isna().sum()
print(f"total empty count: {total_na}")
print(df.loc[df[txt_col].isna(), txt_col])
df.loc[df[txt_col].isna(), txt_col] = ''
total_na = df[txt_col].copy().isna().sum()
print(f"new total empty count: {total_na}")
total empty count: 1
Copy/ Paste An Essay Response of 500 words or less (copy and paste in webform) using the following prompt: Community disruptions such as Covid-19 and other natural disasters can have deep lasting impacts. Discuss a challenge or barrier you have overcome during the Covid-19 pandemic.
146 <NA>
new total empty count: 0
# df
# split each paragraph into sentences
# trailing space guarantees the final sentence's terminator matches '[!.?] '
sentences_df = df[txt_col].copy() + ' '
sentences_df = sentences_df.str.replace('Dr.', 'doctor', regex=False) # I saw this abbreviation so i'm just replacing it
# per-paragraph sentence count, computed BEFORE expand so it stays a Series
senc_per_paragraph = sentences_df.str.split(pat='[!.?] ', regex=True).list.len()
sentences_df = sentences_df.str.split(pat='[!.?] ', regex=True, expand=True) # we assume sentences end in '.' '?' or '!' so we split on these
sentences_df = sentences_df.fillna('') # replace NA with empty strings
# widest paragraph determines the column count; shorter ones are padded with ''
max_n_sencs = sentences_df.shape[1]
n_rows = sentences_df.shape[0]
sentences_df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | During the early parts of 2020 one of the dead... | This pandemic has bedeviled me with many hurdl... | The most difficult barrier I've had to overcom... | As an international transfer student from Guya... | I only had the privilege to experience what it... | Never in my life prior to covid have I taken a... | Online classes for some courses required us to... | There were short comings during the trial-and-... | Getting simulated with the new covid-19 enviro... | As a very family-oriented person this was puni... | ... | ||||||||||
| 1 | One of the most challenging times has been dur... | It truly had an impact on my family emotionall... | My father lost his employment in an industry t... | This did have a significant burden on my family | Additionally, I developed Covid-19 this fall s... | This did have an impact on my GPA and caused m... | Currently, I am working two jobs, an on-campus... | Many times, it's been challenging balancing my... | However, I am a motivated student with aspirat... | If given the opportunity to be awarded a NABA ... | ... | ||||||||||
| 2 | When covid restrictions took place I was just ... | In March of 2020, that was the last time I wou... | Because of the coronavirus people began to los... | So, here I am about seventeen years old, worki... | I ended up quitting sports and extra activitie... | This was extremely hard for me because in Marc... | I lost a big part of my childhood and this for... | I eventually figured out what college I would ... | COVID set my family back a lot financially to ... | Being a first-generation college student, I di... | ... | ||||||||||
| 3 | When the Covid-19 pandemic first began, I was ... | In the Fall of 2020, I started college with th... | Starting my first year of college online made ... | Instead, I was faced with attending classes on... | With Zoom, it was a rocky start with getting u... | It was also much more difficult to really enjo... | There was also the difficulty with socializing | Not being able to be on campus made it very di... | I did join clubs, GroupMe chats, follow the so... | This made it difficult to really enjoy school,... | ... | ||||||||||
| 4 | Community disruptions such as Covid-19 and oth... | The Covid-19 pandemic has taught us that disru... | As such, we are to remain vigilant and always ... | Covid-19 has had a big impact on me | I have been affected financially, psychologica... | Like many other students, I painfully grappled... | I started university in Fall 2020 with two of ... | My Dad also lost his job during the beginning ... | There are times when I wonder about how my fam... | The hope and a tenacity for quality and succes... | ... | ||||||||||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 195 | There are many issues that are impacting the a... | I believe one of the most important issues is ... | When talking about, Covid-19 it has impacted m... | Challenges and barriers that I have had to ove... | Learning is an essential part of everyday life | It is the process behind acquiring a new under... | Technology is the sum of skills, methods, and ... | Learning and technology have not only transfor... | Personally, as I continue to progress in my co... | Whether that be studying by myself or learning... | ... | ||||||||||
| 196 | My name is Wunmi Surakat, a current 1st semest... | At the start of the COVID-19 pandemic, I was i... | I remembered clearly, I was on Spring Break in... | After spring break was over, all my classes we... | Learning online became the biggest challenge a... | For the past 18years of my life, I had In-pers... | The challenges range from staying focused whil... | The move to digital learning was filled with c... | Going from a structured day on campus to a com... | Getting into Zoom for lecture classes was the ... | ... | ||||||||||
| 197 | No one expected the Covid- 19 pandemic | Since it happened it has opened my eyes and ma... | The pandemic has affected my life in many ways... | One challenge that I had to face was my bigges... | I had a fear of going to college because at th... | After losing my mother at such an early age, I... | While taking on this responsibility, I always ... | After immigrating to the United States, I mana... | After being out of school for so many years, I... | My first semester was Spring 2020, and this wa... | ... | ||||||||||
| 198 | During the COVID-19 pandemic, my entire life c... | I made an 180 degree turn in my life, finally ... | I went from extremely motivated, bubbly, energ... | I moved from across the country to a universit... | I faced several low points in my life during t... | Around March 13th, 2020, I was required to mov... | Since Valdosta State University had become my ... | Thankfully my grandmother had an extra room at... | During the moving process, I was forced to sel... | This debacle left me car-less and temporarily ... | ... | ||||||||||
| 199 | The Covid-19 pandemic is something that has af... | As a recent graduate and being in the first se... | I came to college in 2018, so I only had one i... | When I first came to college, I was always the... | When my university went virtual, that took awa... | It was harder for professors to lecture in a w... | This transition from in-person to online affec... | From a professional perspective, during my int... | I found myself having to go the extra mile to ... | With this challenge, I have always managed to ... | ... |
200 rows × 43 columns
# stack all the sentences into one column
# (row-major: paragraph 0's sentence slots, then paragraph 1's, ...; length = n_rows * max_n_sencs)
sentences_series = sentences_df.copy().stack().reset_index(drop=True)
sentences_series = sentences_series.str.normalize_spaces()
sentences_series
0 During the early parts of 2020 one of the dead...
1 This pandemic has bedeviled me with many hurdl...
2 The most difficult barrier I've had to overcom...
3 As an international transfer student from Guya...
4 I only had the privilege to experience what it...
...
8595
8596
8597
8598
8599
Length: 8600, dtype: object
# preprocess the sentences:
# 1. remove all non-letter character
# 2. remove all english 'stop words' (words like 'the' 'I' 'may' 'this' 'him' basically words that don't add meaning to sentences
# 3. stem all words (for example 'moving', 'moved', and 'move' would all be changed to 'mov'
# load word stemmer
stemmer = PorterStemmer()
# get stop words from tfidf
tfidf = TfidfVectorizer(stop_words='english')
# NOTE(review): _get_stop_words is a private cuml API — may break on upgrade;
# consider importing the ENGLISH_STOP_WORDS list directly instead.
stop_words = tfidf._get_stop_words() # stop words will be removed
del tfidf
def preproc_senc_srs(a_series):
    """Preprocess a cudf Series of sentences for topic modeling.

    Steps: lowercase and strip all non-letter characters, remove English
    stop words, then Porter-stem every remaining token. Returns a new
    Series; the input is not modified. Relies on module-level `stemmer`
    and `stop_words`.
    """
    preproc_sentences_series = a_series.copy()
    # remove all non-alphabet characters
    # (raw string: '\s' is a regex class, not a Python escape — the original
    # non-raw literal triggers an invalid-escape warning on modern Python)
    preproc_sentences_series = preproc_sentences_series.str.normalize_spaces().str.lower().str.replace(r'[^A-Za-z\s]', '', regex=True)
    # remove each stop word; \b anchors keep substrings inside words intact
    for stop_word in stop_words:
        preproc_sentences_series = preproc_sentences_series.str.replace(fr'\b{stop_word}\b', '', regex=True)
    # stem each word
    preproc_sentences_series = preproc_sentences_series.str.normalize_spaces()
    # append a ' -' sentinel so even fully-emptied sentences contribute one
    # token, keeping the repeated index aligned for detokenize below
    preproc_sentences_series = preproc_sentences_series + ' -'
    sen_lens = preproc_sentences_series.str.split().list.len()
    sen_index = sen_lens.index.repeat(sen_lens)
    # tokenize -> stem every token on the GPU -> stitch tokens back into sentences
    preproc_sentences_series = stemmer.stem(preproc_sentences_series.str.tokenize()).str.detokenize(sen_index)
    # drop the sentinel tokens and tidy whitespace
    preproc_sentences_series = preproc_sentences_series.str.replace('-', '')
    preproc_sentences_series = preproc_sentences_series.str.normalize_spaces()
    return preproc_sentences_series
preproc_sentences_series = preproc_senc_srs(sentences_series)
preproc_sentences_series
0 earli part deadliest virus plagu world novel c...
1 pandem bedevil hurdl overcom drastic chang life
2 difficult barrier ive overcom get simul new co...
3 intern transfer student guyana extrem difficul...
4 privileg experi felt like attend class campu m...
...
8595
8596
8597
8598
8599
Length: 8600, dtype: object
# get the labels
# Attach each paragraph's label as the index, then stack so every sentence
# slot carries its paragraph's label in 'level_0'.
y_series = sentences_df.copy().set_index(df[label_col])
# sort_values('level_1') restores row-major (paragraph-by-paragraph) order to
# match preproc_sentences_series; the boolean mask then keeps only labels of
# non-empty preprocessed sentences. NOTE(review): this relies on positional
# alignment between the sorted frame and the mask — confirm on cudf upgrade.
y_series = y_series.stack().reset_index().sort_values('level_1')[preproc_sentences_series.str.len() != 0]['level_0'].astype(int)
y_series
0 0
43 0
86 0
129 0
172 1
..
7223 1
7266 1
7309 0
7352 1
7395 1
Name: level_0, Length: 4019, dtype: int64
# drop all empty rows for fitting model (padding slots and stop-word-only sentences)
docs = preproc_sentences_series[preproc_sentences_series.str.len() != 0]
docs
0 earli part deadliest virus plagu world novel c...
1 pandem bedevil hurdl overcom drastic chang life
2 difficult barrier ive overcom get simul new co...
3 intern transfer student guyana extrem difficul...
4 privileg experi felt like attend class campu m...
...
8567 feel unmotiv taken toll mental health way caus...
8568 overcom challeng thought barrier success push ...
8569 famili colleg path corpor world creat legaci f...
8570 continu strive new height daili reach potenti ...
8571 understand purpos challeng face allow experi m...
Length: 4019, dtype: object
# turn Series into a list; move from GPU to CPU (BERTopic expects host-side lists)
docs = docs.to_arrow().to_pylist()
y = y_series.to_arrow().to_pylist()
# fit the topic model (semi-supervised: y guides the reduction toward the labels)
topic_model.fit(docs, y=y)
<bertopic._bertopic.BERTopic at 0x7efd8ae7deb0>
# show topics and topic counts (topic -1 is BERTopic's outlier bucket)
topic_model.get_topic_info()
| Topic | Count | Name | |
|---|---|---|---|
| 0 | -1 | 690 | -1_student_covid_peopl_make |
| 1 | 0 | 852 | 0_pandem_school_did_year |
| 2 | 1 | 219 | 1_covid_chang_famili_know |
| 3 | 2 | 93 | 2_clinic_limit_safeti_wasnt |
| 4 | 3 | 74 | 3_kloot_amanda_wifi_neg |
| ... | ... | ... | ... |
| 70 | 69 | 13 | 69_offer_povertystricken_upcom_michigan |
| 71 | 70 | 12 | 70_encamp_stop_hope_technolog |
| 72 | 71 | 11 | 71_curfew_lower_embassi_tragic |
| 73 | 72 | 11 | 72_oneweek_explan_dormitori_preval |
| 74 | 73 | 10 | 73_scenic_save_teacher_well |
75 rows × 3 columns
# number of real topics: subtract 1 to exclude the outlier topic (-1)
n_topics = topic_model.get_topic_info().shape[0] - 1
n_topics
74
# plot topic distance map
topic_model.visualize_topics(width=1000, height=1000)
# save plot of topic distance map
topic_model.visualize_topics(width=1000, height=1000).write_html("./visualizations/visualize_topics.html")
# show words for each topic
topic_model.visualize_barchart(top_n_topics=topic_model.get_topic_info().shape[0], n_words=10, height=350, width=400)
# save plot of words for each topic
topic_model.visualize_barchart(top_n_topics=topic_model.get_topic_info().shape[0], n_words=10, height=350, width=400).write_html('./visualizations/visualize_barchart.html')
# label each sentence with a topic — NOTE: all 8600 slots, including empty
# padding strings, so positions stay aligned with the paragraph grid
docs = preproc_sentences_series.to_arrow().to_pylist()
doc_topics, topic_probs = topic_model.transform(docs)
# perform topic hierarchy analysis (non-empty sentences only, as in fit)
docs = preproc_sentences_series[preproc_sentences_series.str.len() != 0]
docs = docs.to_arrow().to_pylist()
hierarchical_topics = topic_model.hierarchical_topics(docs)
100%|███████████████████████████████████████████| 73/73 [00:01<00:00, 45.56it/s]
# plot topic hierarchy
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
# save plot of topic hierarchy (note: 'heirarchy' spelling kept — downstream
# consumers may already expect this path)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics).write_html('./visualizations/visualize_heirarchy.html')
# create a dataframe that categorizes each sentence
# (one row per paragraph, one column per sentence slot)
topics_series = pd.Series(doc_topics).astype(str)
# paragraph index repeated once per sentence slot: [0]*max_n_sencs + [1]*... ;
# np.repeat replaces the original manual extend-loop
col_index = np.repeat(np.arange(n_rows), max_n_sencs).tolist()
# detokenize groups the per-sentence topic strings back into one row per
# paragraph, then split fans them out into columns again
senc_topics_df = topics_series.str.detokenize(pd.Series(col_index)).str.split(expand=True)
senc_topics_df = senc_topics_df.rename(columns={x: f'senc_{str(x)}' for x in senc_topics_df.columns})
senc_topics_df
| senc_0 | senc_1 | senc_2 | senc_3 | senc_4 | senc_5 | senc_6 | senc_7 | senc_8 | senc_9 | ... | senc_33 | senc_34 | senc_35 | senc_36 | senc_37 | senc_38 | senc_39 | senc_40 | senc_41 | senc_42 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 8 | 0 | 3 | -1 | 0 | 0 | 4 | -1 | 28 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 |
| 1 | 5 | 3 | 0 | 40 | 36 | 0 | -1 | 0 | 0 | 4 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 |
| 2 | 0 | 20 | 0 | 24 | 23 | 0 | 0 | -1 | 24 | -1 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 |
| 3 | 16 | -1 | -1 | 21 | -1 | 4 | 69 | 72 | 31 | -1 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 |
| 4 | 1 | 1 | 8 | -1 | -1 | 0 | 0 | 0 | 70 | 7 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 195 | -1 | 40 | 12 | 12 | 59 | 60 | 38 | 70 | -1 | 39 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 |
| 196 | 0 | 8 | 11 | 11 | -1 | 9 | 47 | 10 | 67 | 16 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 |
| 197 | 1 | 3 | 40 | 31 | 53 | 69 | 0 | 53 | 35 | 20 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 |
| 198 | 1 | 45 | 41 | 0 | 0 | 45 | 21 | 0 | -1 | 33 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 |
| 199 | 19 | 0 | 35 | 18 | 21 | 7 | 35 | 46 | 3 | 23 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 |
200 rows × 43 columns
# count of each topic per paragraph (rows = paragraphs, cols = topic ids)
topic_count_df = pd.DataFrame(index=range(n_rows), columns=range(n_topics))
for i in range(n_rows):
    # one-row frame: columns are topic-id strings, values are counts in paragraph i
    temp_df = senc_topics_df.iloc[i].reset_index(drop=True).T.value_counts().sort_index().to_frame().T
    # drop the outlier topic by LABEL; the original positional drop
    # (columns[1:]) assumed '-1' is always first and would silently discard a
    # real topic for any paragraph with no outlier sentences
    temp_df = temp_df[[c for c in temp_df.columns if c != '-1']]
    temp_df.columns = temp_df.columns.to_frame()[0].values
    # scatter the counts into the matching topic columns
    topic_count_df.loc[i, temp_df.columns.astype(int)] = temp_df.values.flatten()
topic_count_df = topic_count_df.fillna(0)
topic_count_df = topic_count_df.rename(columns={x: f'topic_{str(x)}' for x in topic_count_df.columns})
topic_count_df
| topic_0 | topic_1 | topic_2 | topic_3 | topic_4 | topic_5 | topic_6 | topic_7 | topic_8 | topic_9 | ... | topic_64 | topic_65 | topic_66 | topic_67 | topic_68 | topic_69 | topic_70 | topic_71 | topic_72 | topic_73 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 5 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 4 | 10 | 5 | 0 | 2 | 0 | 1 | 0 | 1 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 195 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 |
| 196 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 197 | 4 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 198 | 9 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 199 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
200 rows × 74 columns
# normalize each topic count by number of sentences in paragraphs
# (denominator counts ALL sentences, including those labeled outlier -1)
norm_topic_count_df = topic_count_df.astype('float')# + 1e-8
for a_col in norm_topic_count_df.columns:
    norm_topic_count_df[a_col] = norm_topic_count_df[a_col] / senc_per_paragraph
# division by a zero sentence count yields NaN; treat those as zero share
norm_topic_count_df[norm_topic_count_df.isna()] = 0
norm_topic_count_df
| topic_0 | topic_1 | topic_2 | topic_3 | topic_4 | topic_5 | topic_6 | topic_7 | topic_8 | topic_9 | ... | topic_64 | topic_65 | topic_66 | topic_67 | topic_68 | topic_69 | topic_70 | topic_71 | topic_72 | topic_73 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.263158 | 0.052632 | 0.000000 | 0.052632 | 0.052632 | 0.000000 | 0.052632 | 0.000000 | 0.052632 | 0.000000 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.052632 |
| 1 | 0.416667 | 0.000000 | 0.000000 | 0.083333 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
| 2 | 0.363636 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
| 3 | 0.055556 | 0.000000 | 0.000000 | 0.000000 | 0.055556 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.055556 | 0.000000 | 0.0 | 0.055556 | 0.000000 |
| 4 | 0.333333 | 0.166667 | 0.000000 | 0.066667 | 0.000000 | 0.033333 | 0.000000 | 0.033333 | 0.033333 | 0.000000 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.033333 | 0.0 | 0.000000 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 195 | 0.043478 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.043478 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.086957 | 0.0 | 0.000000 | 0.000000 |
| 196 | 0.181818 | 0.045455 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.045455 | 0.000000 | 0.045455 | 0.045455 | ... | 0.000000 | 0.0 | 0.0 | 0.045455 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
| 197 | 0.173913 | 0.043478 | 0.000000 | 0.043478 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.043478 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
| 198 | 0.321429 | 0.035714 | 0.071429 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
| 199 | 0.125000 | 0.000000 | 0.000000 | 0.062500 | 0.000000 | 0.000000 | 0.000000 | 0.062500 | 0.000000 | 0.000000 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
200 rows × 74 columns
# persist the normalized per-paragraph topic shares for downstream analysis
norm_topic_count_df.to_csv(norm_topic_count_filename, index=False)
# # change count to binary value (1 means topic is included, 0 means topic is not included)
# topic_count_df = topic_count_df.astype('float')
# topic_count_df[topic_count_df > 1] = 1
# topic_count_df
# norm_topic_count_df = topic_count_df.copy()
# norm_topic_count_df
# heatmap of percent of topics in a paragraph (to_pandas: seaborn needs host data)
sns.set(rc={'figure.figsize':(10,10)})
sns.heatmap(norm_topic_count_df.to_pandas())
<AxesSubplot: >
# sns.set(rc={'figure.figsize':(20,60)})
# sns.heatmap(norm_topic_count_df.to_pandas())
# average normalized topic share per paragraph (mean == sum / n_rows)
avg_norm_topic_count_df = norm_topic_count_df.mean(axis=0)
# avg_norm_topic_count_df
# heatmap of AVERAGE percent of topics in a paragraph
sns.set(rc={'figure.figsize':(10,1)})
sns.heatmap(avg_norm_topic_count_df.to_frame().T.to_pandas(), cmap=sns.color_palette("viridis", as_cmap=True))
<AxesSubplot: >
# heatmap of AVERAGE percent of topics in a paragraph, sorted by descending values
sns.set(rc={'figure.figsize':(1,20)})
sns.heatmap(avg_norm_topic_count_df.sort_values(ascending=False).to_frame().to_pandas(), cmap=sns.color_palette("viridis", as_cmap=True))
<AxesSubplot: >
# regression target: the cleaned recipient labels as integers (0/1)
labels_df = df[label_col].copy().astype(int)
labels_df
0 0
1 0
2 0
3 0
4 1
..
195 1
196 0
197 0
198 0
199 1
Name: Recipient, Length: 200, dtype: int64
# fit logistic regression model: recipient ~ intercept + per-topic shares
# (statsmodels runs on CPU, so move both sides off the GPU first)
y = labels_df.to_pandas()
X = norm_topic_count_df.to_pandas()
X = sm.add_constant(X)
# reg = sm.Logit(y, X).fit(maxiter=1000, method='bfgs')
# regularized fit chosen over plain MLE, presumably because p (74 topics) is
# large relative to n (200 paragraphs) — confirm the default L1 penalty is intended
reg = sm.Logit(y, X).fit_regularized(maxiter=1000)
Optimization terminated successfully (Exit mode 0)
Current function value: 0.4775846474931757
Iterations: 428
Function evaluations: 428
Gradient evaluations: 428
# per-topic coefficients and p-values; sign indicates whether a topic's
# presence is associated with higher or lower recipient odds
print(reg.summary())
Logit Regression Results
==============================================================================
Dep. Variable: Recipient No. Observations: 200
Model: Logit Df Residuals: 125
Method: MLE Df Model: 74
Date: Mon, 13 Feb 2023 Pseudo R-squ.: 0.3060
Time: 01:04:13 Log-Likelihood: -95.517
converged: True LL-Null: -137.63
Covariance Type: nonrobust LLR p-value: 0.1953
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
const 0.0352 1.129 0.031 0.975 -2.177 2.247
topic_0 0.3246 2.522 0.129 0.898 -4.618 5.267
topic_1 5.3894 3.861 1.396 0.163 -2.178 12.957
topic_2 9.0569 8.173 1.108 0.268 -6.962 25.076
topic_3 -3.8147 7.115 -0.536 0.592 -17.761 10.131
topic_4 -3.5357 5.725 -0.618 0.537 -14.757 7.685
topic_5 1.8820 7.575 0.248 0.804 -12.964 16.728
topic_6 4.5438 8.578 0.530 0.596 -12.268 21.356
topic_7 -0.0788 8.996 -0.009 0.993 -17.711 17.554
topic_8 -6.5935 9.542 -0.691 0.490 -25.295 12.108
topic_9 -13.1527 8.152 -1.613 0.107 -29.131 2.825
topic_10 7.8642 10.357 0.759 0.448 -12.434 28.163
topic_11 2.6917 10.003 0.269 0.788 -16.913 22.297
topic_12 9.7982 10.979 0.892 0.372 -11.720 31.316
topic_13 28.5972 10.812 2.645 0.008 7.406 49.788
topic_14 3.4601 7.969 0.434 0.664 -12.160 19.080
topic_15 -30.7921 12.127 -2.539 0.011 -54.561 -7.024
topic_16 -11.4600 11.600 -0.988 0.323 -34.196 11.276
topic_17 9.5532 11.007 0.868 0.385 -12.020 31.127
topic_18 6.7012 9.965 0.672 0.501 -12.831 26.233
topic_19 -9.4377 10.969 -0.860 0.390 -30.936 12.060
topic_20 2.8007 10.154 0.276 0.783 -17.101 22.702
topic_21 12.3816 10.481 1.181 0.237 -8.161 32.925
topic_22 7.1321 12.010 0.594 0.553 -16.408 30.672
topic_23 -11.6635 10.404 -1.121 0.262 -32.055 8.729
topic_24 -11.2415 11.415 -0.985 0.325 -33.614 11.131
topic_25 21.9175 14.355 1.527 0.127 -6.217 50.052
topic_26 25.1285 11.879 2.115 0.034 1.845 48.412
topic_27 -6.0446 12.152 -0.497 0.619 -29.861 17.772
topic_28 -3.1171 14.661 -0.213 0.832 -31.852 25.618
topic_29 10.2555 14.768 0.694 0.487 -18.689 39.200
topic_30 3.7093 11.246 0.330 0.742 -18.332 25.751
topic_31 -1.1273 14.240 -0.079 0.937 -29.038 26.784
topic_32 -26.6997 12.947 -2.062 0.039 -52.074 -1.325
topic_33 9.2730 14.988 0.619 0.536 -20.103 38.649
topic_34 -19.0154 12.457 -1.526 0.127 -43.432 5.401
topic_35 -16.0840 12.210 -1.317 0.188 -40.015 7.847
topic_36 -10.1497 15.538 -0.653 0.514 -40.604 20.305
topic_37 -1.9879 13.719 -0.145 0.885 -28.876 24.901
topic_38 -16.6995 12.527 -1.333 0.182 -41.252 7.853
topic_39 23.6918 13.575 1.745 0.081 -2.914 50.298
topic_40 -16.4830 14.355 -1.148 0.251 -44.619 11.653
topic_41 3.1416 12.743 0.247 0.805 -21.835 28.118
topic_42 16.3925 13.812 1.187 0.235 -10.679 43.464
topic_43 2.4222 12.745 0.190 0.849 -22.558 27.403
topic_44 21.7783 13.720 1.587 0.112 -5.112 48.668
topic_45 18.1146 16.067 1.127 0.260 -13.376 49.606
topic_46 -15.9182 12.674 -1.256 0.209 -40.758 8.922
topic_47 0.2832 18.910 0.015 0.988 -36.780 37.346
topic_48 -9.6117 15.427 -0.623 0.533 -39.848 20.625
topic_49 -31.4390 17.949 -1.752 0.080 -66.618 3.740
topic_50 13.4718 17.336 0.777 0.437 -20.506 47.450
topic_51 5.9128 15.152 0.390 0.696 -23.786 35.611
topic_52 -16.6792 12.176 -1.370 0.171 -40.544 7.186
topic_53 12.0262 15.437 0.779 0.436 -18.229 42.281
topic_54 47.1848 18.663 2.528 0.011 10.605 83.764
topic_55 24.0242 15.755 1.525 0.127 -6.854 54.903
topic_56 25.8465 16.508 1.566 0.117 -6.508 58.201
topic_57 -34.1647 22.276 -1.534 0.125 -77.825 9.495
topic_58 -41.4946 16.714 -2.483 0.013 -74.254 -8.736
topic_59 -2.9710 19.350 -0.154 0.878 -40.897 34.955
topic_60 22.2631 18.808 1.184 0.237 -14.600 59.126
topic_61 26.5717 19.197 1.384 0.166 -11.053 64.196
topic_62 -9.0703 19.946 -0.455 0.649 -48.163 30.023
topic_63 -15.4792 22.815 -0.678 0.497 -60.195 29.237
topic_64 -42.6386 19.588 -2.177 0.029 -81.030 -4.247
topic_65 -13.5277 15.259 -0.887 0.375 -43.435 16.380
topic_66 -40.6069 18.546 -2.190 0.029 -76.956 -4.258
topic_67 -14.5593 15.986 -0.911 0.362 -45.891 16.772
topic_68 24.4555 23.471 1.042 0.297 -21.548 70.459
topic_69 -11.5715 17.619 -0.657 0.511 -46.103 22.960
topic_70 23.5534 21.905 1.075 0.282 -19.379 66.486
topic_71 37.7309 23.036 1.638 0.101 -7.419 82.881
topic_72 1.4405 19.341 0.074 0.941 -36.467 39.348
topic_73 -37.9072 26.900 -1.409 0.159 -90.630 14.816
==============================================================================
# get coefficients of topics that are statistically significant (p < alpha).
# Drop the intercept defensively: if 'const' ever passed the p-value filter,
# the downstream "topic_<n>" -> int parsing would crash on it.
significant_topics_coeffs = reg.params[reg.pvalues < p_alpha].drop('const', errors='ignore')
significant_topics_coeffs
topic_13 28.597197 topic_15 -30.792088 topic_26 25.128523 topic_32 -26.699702 topic_39 23.691843 topic_49 -31.438978 topic_54 47.184751 topic_58 -41.494592 topic_64 -42.638564 topic_66 -40.606944 dtype: float64
# odds ratio = exp(coefficient); sort ascending so strongest
# chance-decreasing topics come first
significant_topics_odds_ratios = significant_topics_coeffs.pipe(np.exp).sort_values()
significant_topics_odds_ratios
topic_64 3.036037e-19 topic_58 9.530766e-19 topic_66 2.315411e-18 topic_49 2.219348e-14 topic_15 4.238050e-14 topic_32 2.537855e-12 topic_39 1.946420e+10 topic_26 8.188022e+10 topic_13 2.627877e+12 topic_54 3.105109e+20 dtype: float64
# odds ratios of significant topics that DECREASE chances (OR < 1).
# The source series was already sorted ascending and boolean filtering
# preserves order, so the former second sort_values() was redundant.
decrease_chances_topics_odds_ratios = significant_topics_odds_ratios[significant_topics_odds_ratios < 1]
decrease_chances_topics_odds_ratios
topic_64 3.036037e-19 topic_58 9.530766e-19 topic_66 2.315411e-18 topic_49 2.219348e-14 topic_15 4.238050e-14 topic_32 2.537855e-12 dtype: float64
# parse the integer topic ids ("topic_<n>" -> n) of the chance-decreasing
# topics and return them in ascending numeric order
decrease_chances_topics = sorted(
    int(name.rsplit('_', 1)[-1])
    for name in decrease_chances_topics_odds_ratios.index
)
decrease_chances_topics
[15, 32, 49, 58, 64, 66]
# plot + save significant topics that decrease chances.
# Build the (expensive) figure once instead of recomputing it for the
# display and again for the HTML export.
decrease_chances_fig = topic_model.visualize_barchart(topics=decrease_chances_topics, n_words=10, height=350, width=400)
decrease_chances_fig.write_html('./visualizations/barchart_decrease_chances_topics.html')
decrease_chances_fig
# collect the sentences assigned to the chance-decreasing topics,
# flatten them to a single column, drop empties, and persist to CSV
decrease_mask = senc_topics_df.astype(int).isin(decrease_chances_topics)
decrease_chances_sencs = (
    sentences_df[decrease_mask]
    .copy()
    .fillna('')
    .stack()
    .reset_index(drop=True)
)
decrease_chances_sencs = decrease_chances_sencs[decrease_chances_sencs.str.len() > 0]
decrease_chances_sencs.to_frame().to_csv(decrease_chances_sencs_filename, index=False)
decrease_chances_sencs
200 However, I am grateful that although I was bat...
216 Due to the COVID-19 pandemic, my mother was st...
222 I was able to help pay rent and utilities by w...
303 Many people, especially in the black community...
497 All of these things greatly helped me to impro...
...
8228 I eventually had to return to my home country
8400 In order to stay ahead of the issues related t...
8440 I made sure my routine sticks as closely to my...
8490 Presently, I can say that I have overcome my b...
8524 My world was falling apart and there wasn't an...
Length: 142, dtype: object
# odds ratios of significant topics that INCREASE chances (OR > 1),
# strongest effect first (the source series is ascending, so this
# descending sort is required)
increase_chances_topics_odds_ratios = (
    significant_topics_odds_ratios[significant_topics_odds_ratios > 1]
    .sort_values(ascending=False)
)
increase_chances_topics_odds_ratios
topic_54 3.105109e+20 topic_13 2.627877e+12 topic_26 8.188022e+10 topic_39 1.946420e+10 dtype: float64
# parse the integer topic ids ("topic_<n>" -> n) of the chance-increasing
# topics and return them in ascending numeric order
increase_chances_topics = sorted(
    int(name.rsplit('_', 1)[-1])
    for name in increase_chances_topics_odds_ratios.index
)
increase_chances_topics
[13, 26, 39, 54]
# plot + save significant topics that increase chances.
# Build the (expensive) figure once instead of recomputing it for the
# display and again for the HTML export.
increase_chances_fig = topic_model.visualize_barchart(topics=increase_chances_topics, n_words=10, height=350, width=400)
increase_chances_fig.write_html('./visualizations/barchart_increase_chances_topics.html')
increase_chances_fig
# collect the sentences assigned to the chance-increasing topics,
# flatten them to a single column, drop empties, and persist to CSV
increase_mask = senc_topics_df.astype(int).isin(increase_chances_topics)
increase_chances_sencs = (
    sentences_df[increase_mask]
    .copy()
    .fillna('')
    .stack()
    .reset_index(drop=True)
)
increase_chances_sencs = increase_chances_sencs[increase_chances_sencs.str.len() > 0]
increase_chances_sencs.to_frame().to_csv(increase_chances_sencs_filename, index=False)
increase_chances_sencs
142 It was very difficult for me to establish a ti...
230 My dedication to not only my family but also m...
369 Andy Rooney once said and I quote "everyone wa...
435 Classrooms collected dust while Zoom became th...
445 The halls I used to comfortably navigate, I no...
...
8319 With work, I have kept consistent communicatio...
8354 The summer after my freshman year was hectic
8394 Whether that be studying by myself or learning...
8569 I am the first one in my family to go to colle...
8570 I am continuing to strive to new heights daily...
Length: 125, dtype: object
# get the topics that decrease chances the most (ignoring statistical significance).
# Drop the intercept first: with only 75 params, 'const' could otherwise
# land in the bottom k and crash the "topic_<n>" -> int parsing.
bottom_k_params = reg.params.drop('const', errors='ignore').sort_values()
bottom_k_topics = [int(name.rsplit('_', 1)[-1]) for name in bottom_k_params.index[:k_topics]]
bottom_k_topics
[64, 58, 66, 73, 57]
# plot + save the bottom-k (most chance-decreasing) topics.
# Fixes the copy-pasted comment that mislabeled this as the "significant
# topics that increase chances" plot, and builds the figure once instead
# of recomputing it for display and for saving.
bottom_k_fig = topic_model.visualize_barchart(topics=bottom_k_topics, n_words=10, height=350, width=400)
bottom_k_fig.write_html('./visualizations/barchart_bottom_k_topics.html')
bottom_k_fig
# collect the sentences assigned to the bottom-k topics, flatten them to
# a single column, drop empties, and persist to CSV
bottom_k_mask = senc_topics_df.astype(int).isin(bottom_k_topics)
bottom_k_sencs = (
    sentences_df[bottom_k_mask]
    .copy()
    .fillna('')
    .stack()
    .reset_index(drop=True)
)
bottom_k_sencs = bottom_k_sencs[bottom_k_sencs.str.len() > 0]
bottom_k_sencs.to_frame().to_csv(bottom_k_sencs_filename, index=False)
bottom_k_sencs
12 I became more computer literate, got better at...
222 I was able to help pay rent and utilities by w...
522 I am getting this degree on my own
569 To manage these drastic changes and nightly re...
704 I overcame it
...
8223 I had limited options and no budget for unanti...
8228 I eventually had to return to my home country
8305 It was not an easy time for me during that period
8400 In order to stay ahead of the issues related t...
8440 I made sure my routine sticks as closely to my...
Length: 73, dtype: object
# get the topics that increase chances the most (ignoring statistical significance).
# Drop the intercept first: 'const' could otherwise land in the top k and
# crash the "topic_<n>" -> int parsing.
top_k_params = reg.params.drop('const', errors='ignore').sort_values(ascending=False)
top_k_topics = [int(name.rsplit('_', 1)[-1]) for name in top_k_params.index[:k_topics]]
top_k_topics
[54, 71, 13, 61, 56]
# plot + save the top-k (most chance-increasing) topics.
# Fixes the copy-pasted comment that mislabeled the save step as the
# "significant topics that increase chances" plot, and builds the figure
# once instead of recomputing it for display and for saving.
top_k_fig = topic_model.visualize_barchart(topics=top_k_topics, n_words=10, height=350, width=400)
top_k_fig.write_html('./visualizations/barchart_top_k_topics.html')
top_k_fig
# collect the sentences assigned to the top-k topics, flatten them to a
# single column, drop empties, and persist to CSV
top_k_mask = senc_topics_df.astype(int).isin(top_k_topics)
top_k_sencs = (
    sentences_df[top_k_mask]
    .copy()
    .fillna('')
    .stack()
    .reset_index(drop=True)
)
top_k_sencs = top_k_sencs[top_k_sencs.str.len() > 0]
top_k_sencs.to_frame().to_csv(top_k_sencs_filename, index=False)
top_k_sencs
369 Andy Rooney once said and I quote "everyone wa...
403 At the beginning of my second year of college,...
445 The halls I used to comfortably navigate, I no...
490 Still struggling with my mental health, I want...
538 I want a quality education so I can to help hi...
...
8183 Thirdly, I began to plan and use my time wisely
8216 With so many unknowns and contradicting resear...
8258 In high school, I found it easy to form relati...
8320 Additionally, I have enrolled in my general ed...
8570 I am continuing to strive to new heights daily...
Length: 109, dtype: object